{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**注意：**该notebook参考了 *\"Machine Learning in Action\"* 一书中的代码，书中的代码虽然变量命名极其混乱，不过逻辑非常清晰，非常适合用作参考代码。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.sys.path.append(os.path.dirname(os.path.abspath('.')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from datasets.dataset import load_breast_cancer\n",
    "data = load_breast_cancer()\n",
    "X, Y = data.data, data.target\n",
    "Y[Y == 0] = -1\n",
    "del data\n",
    "\n",
    "from model_selection.train_test_split import train_test_split\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型基础\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def select_lambda(a,n_samples):\n",
    "#     '''\n",
    "#     选取第二个lambda\n",
    "#     '''\n",
    "#     b=a\n",
    "#     while b==a:\n",
    "#         b=np.random.randint(n_samples)\n",
    "#     return b\n",
    "\n",
    "def clip_lambda(lambda_b,L,H):\n",
    "    '''\n",
    "    对lambda的截断\n",
    "    '''\n",
    "    if lambda_b<L:\n",
    "        lambda_b=L\n",
    "    if lambda_b>H:\n",
    "        lambda_b=H\n",
    "    return lambda_b"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以下代码逻辑请参阅博客[SMO]()。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# def SMO_simp(X_train, Y_train, C, tol, max_iter):\n",
    "#     '''\n",
    "#     简化版SMO，使用遍历的方式来选取第一个lambda，再随机选取第二个lambda\n",
    "#     '''\n",
    "#     n_samples, n_feature = X_train.shape\n",
    "\n",
    "#     lambdas = np.zeros(n_samples)    # 每一样本都对应一个lambda\n",
    "#     b = 0\n",
    "#     iter_cnt = 0\n",
    "\n",
    "#     while iter_cnt < max_iter:\n",
    "#         lambda_changed_cnt = 0\n",
    "\n",
    "#         # 以遍历的方式来选取第一个lambda\n",
    "#         for i in range(n_samples):  \n",
    "#             # 第一个lambda对应的预测值与误差\n",
    "#             y_i = np.sum((lambdas*Y_train)*np.dot(X_train, X_train[i, :].T))+b\n",
    "#             err_i = y_i-Y_train[i]\n",
    "\n",
    "#             if (Y_train[i]*err_i < -tol and lambdas[i] < C) or (Y_train[i]*err_i > tol and lambdas[i] > 0):\n",
    "#                 j = select_lambda(i, n_samples)    # 选择第二个lambda\n",
    "#                 # 第二个lambda对应的预测值与误差\n",
    "#                 y_j = np.dot((lambdas*Y_train).T,\n",
    "#                              np.dot(X_train, X_train[j, :].T))+b\n",
    "#                 err_j = y_j-Y_train[j]\n",
    "\n",
    "#                 lambda_i_pre, lambda_j_pre = lambdas[i].copy(\n",
    "#                 ), lambdas[j].copy()    # 保存两lambda的旧值\n",
    "\n",
    "#                 # 计算上下界\n",
    "#                 if Y_train[i] != Y_train[j]:\n",
    "#                     L = max(0, lambdas[j]-lambdas[i])\n",
    "#                     H = min(C, C+lambdas[j]-lambdas[i])\n",
    "#                 else:\n",
    "#                     L = max(0, lambdas[j]+lambdas[i]-C)\n",
    "#                     H = min(C, lambdas[j]+lambdas[i])\n",
    "#                 if L == H:\n",
    "#                     continue\n",
    "\n",
    "#                 eta = 2*np.dot(X_train[i, :], X_train[j, :].T)-np.dot(\n",
    "#                     X_train[i, :], X_train[i, :].T)-np.dot(X_train[j, :], X_train[j, :].T)\n",
    "#                 if eta >= 0:\n",
    "#                     continue\n",
    "\n",
    "#                 # 优化lambda_j\n",
    "#                 lambdas[j] -= Y_train[j]*(err_i-err_j)/eta\n",
    "#                 lambdas[j] = clip_lambda(lambdas[j], L, H)\n",
    "#                 delta_lambda_j = lambdas[j]-lambda_j_pre\n",
    "#                 if abs(delta_lambda_j) < 1e-5:    # 更新量太小则放弃\n",
    "#                     continue\n",
    "\n",
    "#                 # 优化lambda_i\n",
    "#                 lambdas[i] -= Y_train[i]*Y_train[j]*delta_lambda_j\n",
    "#                 delta_lambda_i = lambdas[i]-lambda_i_pre\n",
    "\n",
    "#                 # 偏移量\n",
    "#                 b_i = b-err_i-Y_train[i]*delta_lambda_i*np.dot(\n",
    "#                     X_train[i, :], X_train[i, :].T)-Y_train[j]*delta_lambda_j*np.dot(X_train[i, :], X_train[j, :].T)\n",
    "#                 b_j = b-err_j-Y_train[i]*delta_lambda_i*np.dot(\n",
    "#                     X_train[i, :], X_train[j, :].T)-Y_train[j]*delta_lambda_j*np.dot(X_train[j, :], X_train[j, :].T)\n",
    "#                 if 0 < lambdas[i] < C:\n",
    "#                     b = b_i\n",
    "#                 elif 0 < lambdas[j] < C:\n",
    "#                     b = b_j\n",
    "#                 else:\n",
    "#                     b = (b_i+b_j)/2\n",
    "\n",
    "#                 lambda_changed_cnt += 1\n",
    "\n",
    "#         if lambda_changed_cnt == 0:\n",
    "#             iter_cnt += 1\n",
    "#         else:\n",
    "#             iter_cnt = 0\n",
    "\n",
    "#     return lambdas, b\n",
    "\n",
    "\n",
    "# lambdas, b=SMO_simp(X_train, Y_train, 0.6, 0.001, 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 优化\n",
    "\n",
    "以上代码就是 *'Machine Learning in Action'* 一书中简化版SMO的代码，相信参照博客内容应该很容易读懂。虽然上述实现符合朴素的SMO思想，但是作为代码实现，有很多可以优化的空间。\n",
    "\n",
    "首先是$\\lambda$的选取，上述简化版本的SMO算法是始终采用遍历方式来选取第一个$\\lambda_{a}$，然后再随机选取第二个$\\lambda_{b}$。实际上，有一种启发式的方法可以每次选出比较好的一对$\\lambda$来做优化，大致思想如下：\n",
    "1. 初始化所有$\\lambda$为0，并且同简化版SMO一样，遍历所有$\\lambda_{a}$，再随机选取$\\lambda_{b}$，对所有可以优化的参数对做一轮优化\n",
    "2. 选取满足$0<\\lambda_{a}<C$的参数作为第一个$\\lambda_{a}$，满足此条件$\\lambda$对应的样本称为非边界样本；而对第二个参数，选取满足$max(E_{a}-E_{b})$的参数作为$\\lambda_{b}$\n",
    "3. 不断重复第$2$步，直到不存在非边界样本，然后再对所有$\\lambda$遍历优化，再进入第$2$步\n",
    "\n",
    "首先，定义一个RBF核函数，注意到优化后SVM的预测函数为：\n",
    "$$\n",
    "\\begin{aligned}\n",
    "\\hat{y}&=\n",
    "\\left\\{\n",
    "\\begin{aligned}\n",
    "&+1,  &\\sum_{i=1}^{m}\\lambda_{i}^{*}y^{i}\\langle\\hat{x},x^{i}\\rangle\\ge+1\\\\\n",
    "&-1,  &\\sum_{i=1}^{m}\\lambda_{i}^{*}y^{i}\\langle\\hat{x},x^{i}\\rangle\\le+1\\\\\n",
    "\\end{aligned}\n",
    "\\right.\n",
    "\\end{aligned}\n",
    "$$\n",
    "其中$\\hat{x}$为测试样本，一般我们会以矩阵形式输入，而$x^{i}$为训练样本，这里的累加不知道可不可以使用numpy加速，我暂时没有想到怎么去实现。所以需要定义一个接受矩阵与向量为参数，返回RBF核矩阵的函数。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rbf(mat, array, sigma=1):\n",
    "    '''\n",
    "    RBF核变换函数，返回mat与array的RBF核矩阵\n",
    "    '''\n",
    "    n_samples, n_feature = mat.shape\n",
    "    K_mat = np.zeros(n_samples)\n",
    "\n",
    "    for i in range(n_samples):\n",
    "        K_mat[i] = np.dot((mat[i, :]-array), (mat[i, :]-array).T)\n",
    "    K_mat = np.exp(-K_mat/2/(sigma**2))\n",
    "    return K_mat\n",
    "\n",
    "# rbf(X_test,X_train[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为了编程便利，同时也是书上的建议，定义一个数据结构类，该类包含了SVM所需要的参数以及数据缓存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class svm_data:\n",
    "    def __init__(self, X_train, Y_train, C, tol):\n",
    "        self.X_train = X_train\n",
    "        self.Y_train = Y_train\n",
    "        self.C = C\n",
    "        self.tol = tol\n",
    "        self.lambdas = np.zeros(len(self.X_train))\n",
    "        self.b = 0\n",
    "        self.err = np.zeros((len(self.X_train), 2))    # 误差缓存，首列为有效位\n",
    "        self.K_mat = np.zeros((len(X_train), len(X_train)))    # 预计算的核矩阵\n",
    "        for i in range(len(X_train)):\n",
    "            self.K_mat[:, i] = rbf(self.X_train, self.X_train[i, :])    # X_train自身对自身的核矩阵"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "接下来定义几个辅助函数。首先是误差计算函数，在上述简化代码中可以看到，第$i$个样本的预测值并没有什么实质意义，只是用于误差```err_i```的计算；然后就是第二个参数$\\lambda_{b}$的选取函数，需要计算与```err_a```差距最大的那一个。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_kth_err(DS, k):\n",
    "    '''\n",
    "    计算err_k\n",
    "    DS: costumize data_structure\n",
    "    k: k_th lambda\n",
    "    '''\n",
    "    y_pred = np.dot((DS.lambdas*DS.Y_train).T, DS.K_mat[k, :])+DS.b\n",
    "    return y_pred-DS.Y_train[k]\n",
    "\n",
    "\n",
    "def update_kth_err(DS, k):\n",
    "    err_k = compute_kth_err(DS, k)\n",
    "    DS.err[k] = [1, err_k]\n",
    "\n",
    "\n",
    "def select_2nd_lambda(i, DS, err_i):\n",
    "    '''\n",
    "    i: 第一个lambda的下标\n",
    "    DS: \n",
    "    err_i: 第一个lambda对应的误差\n",
    "    '''\n",
    "    max_delte_err = 0\n",
    "    err_j = 0\n",
    "    j = -1    # 默认j为最后一个\n",
    "\n",
    "    DS.err[i] = [1, err_i]\n",
    "    valid_err_idxs = np.nonzero(DS.err[:, 0])[0]    # 标记位非零的idxs\n",
    "\n",
    "    if len(valid_err_idxs) > 1:    # 如果已有误差缓存\n",
    "        for idx in valid_err_idxs:\n",
    "            if idx == i:\n",
    "                continue\n",
    "            else:\n",
    "                err_k = compute_kth_err(DS, idx)\n",
    "                delta_err = abs(err_i-err_k)\n",
    "                if delta_err > max_delte_err:\n",
    "                    j = idx\n",
    "                    err_j = err_k\n",
    "                    max_delte_err = delta_err\n",
    "        return j, err_j\n",
    "\n",
    "    else:    # 如果没有误差缓存，则只能随机选第二个lambda\n",
    "        j = i\n",
    "        n_samples = len(DS.X_train)\n",
    "        while j == i:\n",
    "            j = np.random.randint(n_samples)\n",
    "        return j, compute_kth_err(DS, j)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "下面是一个成对优化$\\lambda$的函数，给定第一个参数$\\lambda_{a}$，它会自动选取第二个参数$\\lambda_{b}$并同时优化它们。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def optimize_lambda(i, DS):\n",
    "    '''\n",
    "    成对优化lambda的函数，返回值为表示是否优化的布尔值\n",
    "    '''\n",
    "    err_i = compute_kth_err(DS, i)\n",
    "    if (DS.Y_train[i]*err_i < -DS.tol and DS.lambdas[i] < DS.C) or (DS.Y_train[i]*err_i > DS.tol and DS.lambdas[i] > 0):\n",
    "        j, err_j = select_2nd_lambda(i, DS, err_i)    # 以优化方式选取第二个lambda\n",
    "        lambda_i_pre, lambda_j_pre = DS.lambdas[i].copy(\n",
    "        ), DS.lambdas[j].copy()    # 保存两lambda的旧值\n",
    "\n",
    "        # 计算上下界\n",
    "        if DS.Y_train[i] != DS.Y_train[j]:\n",
    "            L = max(0, DS.lambdas[j]-DS.lambdas[i])\n",
    "            H = min(DS.C, DS.C+DS.lambdas[j]-DS.lambdas[i])\n",
    "        else:\n",
    "            L = max(0, DS.lambdas[j]+DS.lambdas[i]-DS.C)\n",
    "            H = min(DS.C, DS.lambdas[j]+DS.lambdas[i])\n",
    "        if L == H:\n",
    "            return 0\n",
    "\n",
    "        eta = 2*DS.K_mat[i,j]-DS.K_mat[i,i]-DS.K_mat[j,j]\n",
    "        if eta >= 0:\n",
    "            return 0\n",
    "\n",
    "        # 优化lambda_j\n",
    "        DS.lambdas[j] -= DS.Y_train[j]*(err_i-err_j)/eta\n",
    "        DS.lambdas[j] = clip_lambda(DS.lambdas[j], L, H)\n",
    "        update_kth_err(DS, j)    # 更新缓存\n",
    "        delta_lambda_j = DS.lambdas[j]-lambda_j_pre\n",
    "        if abs(delta_lambda_j) < 1e-5:    # 更新量太小则放弃\n",
    "            return 0\n",
    "\n",
    "        # 优化lambda_i\n",
    "        DS.lambdas[i] -= DS.Y_train[i]*DS.Y_train[j]*delta_lambda_j\n",
    "        update_kth_err(DS, i)    # 更新缓存\n",
    "        delta_lambda_i = DS.lambdas[i]-lambda_i_pre\n",
    "\n",
    "        # 偏移量\n",
    "        b_i = DS.b-err_i-DS.Y_train[i]*delta_lambda_i*DS.K_mat[i,i]-DS.Y_train[j]*delta_lambda_j*DS.K_mat[i,j]\n",
    "        b_j = DS.b-err_j-DS.Y_train[i]*delta_lambda_i*DS.K_mat[i,j]-DS.Y_train[j]*delta_lambda_j*DS.K_mat[j,j]\n",
    "        if 0 < DS.lambdas[i] < DS.C:\n",
    "            DS.b = b_i\n",
    "        elif 0 < DS.lambdas[j] < DS.C:\n",
    "            DS.b = b_j\n",
    "        else:\n",
    "            DS.b = (b_i+b_j)/2\n",
    "        \n",
    "        return 1\n",
    "    return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def SMO(X_train,Y_train,C,tol,max_iter):\n",
    "    n_samples,n_feature=X_train.shape\n",
    "    DS=svm_data(X_train,Y_train,C,tol)\n",
    "    iter_cnt=0\n",
    "    full_set_flag=True    # 遍历所有lambda的标记位，第一次循环或找不到非边界样本时设为True\n",
    "    lambda_changed_cnt=0\n",
    "    \n",
    "    while iter_cnt<max_iter and (lambda_changed_cnt>0 or full_set_flag):\n",
    "        lambda_changed_cnt=0\n",
    "        if full_set_flag:\n",
    "            for i in range(n_samples):    # 遍历选取第一个lambda\n",
    "                tmp=optimize_lambda(i,DS)\n",
    "                lambda_changed_cnt+=tmp\n",
    "            iter_cnt+=1\n",
    "        else:    # 存在非边界样本时\n",
    "            non_bound_idxs=np.nonzero((DS.lambdas>0)*(DS.lambdas<C))[0]    # 非边界样本，lambda大于0小于C\n",
    "            for idx in non_bound_idxs:    # 在非边界样本中选取第一个lambda\n",
    "                tmp=optimize_lambda(idx,DS)\n",
    "                lambda_changed_cnt+=tmp\n",
    "            iter_cnt+=1\n",
    "        \n",
    "        if full_set_flag:    # 如果上轮以遍历方式选取第一个lambda，则下次不使用遍历方式\n",
    "            full_set_flag=False    \n",
    "        elif lambda_changed_cnt==0:    # 如果一轮迭代没有做出优化，则下次尝试使用遍历方式\n",
    "            full_set_flag=True\n",
    "            \n",
    "    return DS.lambdas,DS.b\n",
    "\n",
    "lambdas,b=SMO(X_train,Y_train, 0.6, 0.001, 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "训练完毕的SVM预测函数为：\n",
    "$$\n",
    "\\begin{aligned}\n",
    "\\hat{y}&=\n",
    "\\left\\{\n",
    "\\begin{aligned}\n",
    "&+1,  &\\sum_{i=1}^{m}\\lambda_{i}^{*}y^{i}\\langle\\hat{x},x^{i}\\rangle\\ge+1\\\\\n",
    "&-1,  &\\sum_{i=1}^{m}\\lambda_{i}^{*}y^{i}\\langle\\hat{x},x^{i}\\rangle\\le+1\\\\\n",
    "\\end{aligned}\n",
    "\\right.\n",
    "\\end{aligned}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "acc:0.7631578947368421\n"
     ]
    }
   ],
   "source": [
    "def predict(X_test,lambdas,X_train,Y_train):\n",
    "    Y_pred=0\n",
    "    for i in range(len(X_train)):\n",
    "        Y_pred+=lambdas[i]*Y_train[i]*rbf(X_test,X_train[i,:])\n",
    "    Y_pred+=b\n",
    "    Y_pred[Y_pred<0]=-1\n",
    "    Y_pred[Y_pred>0]=1\n",
    "    return Y_pred\n",
    "\n",
    "Y_pred=predict(X_test,lambdas,X_train,Y_train)\n",
    "print('acc:{}'.format(np.sum(Y_pred==Y_test)/len(Y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sklearn acc:0.7017543859649122\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\qq435\\appdata\\local\\programs\\python\\python36\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
      "  \"avoid this warning.\", FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "svc=SVC()\n",
    "svc.fit(X_train,Y_train)\n",
    "Y_pred=svc.predict(X_test)\n",
    "print('sklearn acc:{}'.format(np.sum(Y_pred==Y_test)/len(Y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
